library(tidyverse) #tidy data
library(dplyr) #tidy data 
library(Seurat) #scRNAseq infrastructure
library(zeallot) #assignment of pipe
library(magrittr) #assignment of pipe
library(ggplot2) #plots
library(scales) #plots
library(ggthemes) #plots
library(patchwork) #plots

#set up root paths
root <- "./"
##root path for intermediate data
inter_data <- paste0(root, "data/interdata/analysis_v1/")
# system(paste0("mkdir -p ", inter_data))
##root path for deliverables
del_root <- paste0(root, "results/deliverable/analysis_v1/")
# system(paste0("mkdir -p ", del_root))
##root path for plots
plot_root <- paste0(root, "results/plots/analysis_v1/")
# system(paste0("mkdir -p ", plot_root))

rm(root)

Sample information

sample_sheet <- readxl::read_xlsx(path = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/related_files/MOLNG-3814-experimental-variables-template.xlsx",
                                  sheet = 1)
sample_sheet$'2' <- NULL

sample_sheet$`Sample ID` <- gsub(pattern = "S",
                                 replacement = "L",
                                 sample_sheet$`Sample ID`)

sample_sheet_k <- sample_sheet[sample_sheet$`Sample ID` %in% c("L64220", "L64221", "L64997", "L64998", "L65101", "L65102"), ]

knitr::kable(sample_sheet_k)
Sample ID Sample Name Sample Type
L64220 Surface_fish_ovary_1 Wild Type
L64221 Surface_fish_ovary_2 Wild Type
L64997 New_Pachon_ovary_1 Wild Type
L64998 New_Pachon_ovary_2 Wild Type
L65101 New_Molino_ovary_1 Wild Type
L65102 New_Molino_ovary_2 Wild Type


Quality control (QC)

Ambient RNA correction with CellBender 0.3.0

According to 10X Genomics guide, all samples need ambient RNA correction because all the knee plots lack characteristic steep cliff.


#tutorial: https://cellbender.readthedocs.io/en/latest/usage/index.html

#conda environment installed as cellbender

#tested one sample, it worked
#now we will submit a slurm job to run all samples in parallel 

#generate a sample sheet (we only need a path, then cd, then run it there)
paths <- dir(path = "/n/core/Bioinformatics/secondary/Rohner/fx2482/MOLNG-3814.Astyanax_mexicanus-2.0.Ens_110/",
             pattern = "outs",
             recursive = TRUE,
             include.dirs = TRUE,
             full.names = TRUE)
paths %<>%
  as.data.frame()

write.table(paths,
            sep = " ",
            file = paste0("/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellbender.txt"),
            quote = FALSE,
            row.names = FALSE, col.names = FALSE)
#!/bin/bash
#SBATCH --partition gpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1 
#SBATCH --ntasks=1 
#SBATCH --mem=5GB
#SBATCH --job-name=cellbender
#SBATCH --array=1-6
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err

. ~/anaconda3/etc/profile.d/conda.sh

conda activate cellbender

paths=$(cat /n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellbender.txt|\
         awk -v var=$SLURM_ARRAY_TASK_ID 'NR==var {print $1}')

cd ${paths}

cellbender remove-background \
      --cuda \
      --input raw_feature_bc_matrix.h5 \
      --output output.h5
#!/bin/bash
#SBATCH --nodes=1 
#SBATCH --ntasks=1 
#SBATCH --mem=5GB
#SBATCH --job-name=cellbender
#SBATCH --array=1-6
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err

. ~/anaconda3/etc/profile.d/conda.sh

conda activate cellbender

paths=$(cat /n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellbender.txt|\
         awk -v var=$SLURM_ARRAY_TASK_ID 'NR==var {print $1}')

cd ${paths}

ptrepack --complevel 5 output_filtered.h5:/matrix output_filtered_seurat.h5:/matrix

Detection-based quality control (QC)

QC visualization

Visualization of nFeature and nCount

## ===== Generate Seurat object ===== ##
paths <- list.files(path = "/n/core/Bioinformatics/secondary/Rohner/fx2482/MOLNG-3814.Astyanax_mexicanus-2.0.Ens_110/",
                    full.names = FALSE,
                    recursive = TRUE,
                    pattern = "output_filtered_seurat.h5")

samples <- gsub(pattern = "\\/outs\\/output_filtered_seurat.h5",
                replacement = "",
                paths)

ss <- cbind.data.frame("sample" = samples,
                       "path" = paste0("/n/core/Bioinformatics/secondary/Rohner/fx2482/MOLNG-3814.Astyanax_mexicanus-2.0.Ens_110/", paths))

#start here, replace lib names with sample names
ds <- readxl::read_xlsx(path = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/related_files/MOLNG-3814-experimental-variables-template.xlsx",
                      sheet = 1)
colnames(ds)[1:2] <- c("lib_id", "sample_name")
ds$lib_id <- gsub(pattern = "S",
                  replacement = "L",
                  ds$lib_id)
ds_k <- ds[ds$lib_id %in% samples, ]#keep the samples that worked only
ds_k$sample_name <- gsub(pattern = "New_",
                         replacement = "",
                         ds_k$sample_name)
ds_k$sample_name <- gsub(pattern = "ovary_",
                         replacement = "",
                         ds_k$sample_name)

rm(ds)

ss_j <- left_join(x = ss,
                  y = ds_k %>%
                    select(lib_id, sample_name),
                  by = join_by(sample == lib_id))
colnames(ss_j)[1:2] <- c("Library_ID", "Paths")

#read in seurat h5 files
seu.list <- list()
for (i in seq_along(ss_j$Library_ID)){
  data.list <- Read10X_h5(filename = ss_j$Paths[i],
                          use.names = TRUE)

  seu.list[[i]] <- CreateSeuratObject(counts = data.list,
                                      project = ss_j$sample_name[i])
  seu.list[[i]]$sample_name <- ss_j$sample_name[i]
  seu.list[[i]]$lib_ID <- ss_j$Library_ID[i]
}
names(seu.list) <- ss_j$sample_name

#use nCount>500 as real cells like 10x did to keep real cells only
seu.list <- lapply(X = seu.list,
                   FUN = function(x){
                     x <- subset(x, subset = nCount_RNA > 500)
                   })

rm(data.list, ds_k, ss, ss_j, i, paths, samples)

save(seu.list, file = paste0(inter_data, "seuratobj.RData"))
#QC
##we cannot calculate percentage of mitochondrial genes and ribosomal genes due to lack of annotation

#plot QC
##merge for visualization
seu_m <- merge(seu.list[[1]], y = seu.list[2:length(seu.list)],
               add.cell.ids = names(seu.list))

#change the sequence of samples
seu_m$sample_name <- factor(seu_m$sample_name,
                            levels = unique(seu_m$sample_name))

pv1 <- VlnPlot(seu_m,
               features = c("nFeature_RNA"),
               pt.size = 0,
               cols = nord::nord_palettes$afternoon_prarie[3:9],
               y.max = 10000,
               ncol = 1,
               group.by = "sample_name") +
  NoLegend() +
  xlab(label = "")
pv2 <- VlnPlot(seu_m,
               features = c("nCount_RNA"),
               pt.size = 0,
               cols = nord::nord_palettes$afternoon_prarie[3:9],
               y.max = 125000,
               ncol = 1,
               group.by = "sample_name") +
  NoLegend() +
  xlab(label = "")

png(filename = paste0(plot_root, "All_cells_qc.png"), width = 1500/2*.85, height = 500*2*.85)
wrap_plots(pv1, pv2, ncol = 1)
dev.off()

rm(pv1, pv2)



Keep in mind that Molino_2 has low nFeature_RNA and nCount_RNA.


Visualization of feature and count coorelation

seu_obj <- seu.list

plot_list <- list()
for (i in seq_along(names(seu.list))){
  plot_list[[i]] <-
    FeatureScatter(seu_obj[[i]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA")  + NoLegend() 
  plot_list[[i]] <- plot_list[[i]] + labs(caption = names(seu.list)[i])
}

png(filename = paste0(plot_root, "All_cells_sca.png"), width = 300, height = 300*length(seu.list))
wrap_plots(plot_list, ncol = 1)
dev.off()

rm(plot_list, i, seu_obj)


Detection-based filtering

Cells with unique RNA features of less than 200 or more than mean plus 3 fold of standard deviation were considered as low-quality cells and removed from the downstream analysis. Note that mitochondrial counts were not used for filtering due to lack of annotation.

##detection based filtering
###calculate mean+3*standard deviation as the high cutoff value
qc_list <- vector(mode = "integer",
                  length = length(seu.list))
for (i in seq_along(names(seu.list))){
  qc_list[i] <- round(mean(seu.list[[i]]$nFeature_RNA) + 3 * sd(seu.list[[i]]$nFeature_RNA),
                      digits = 0)
}

###filter genes according to detection
seu_obj_sub <- list()
for (i in seq_along(names(seu.list))){
  seu_obj_sub[[i]] <- subset(seu.list[[i]],
                              nFeature_RNA > 200 & nFeature_RNA < qc_list[i])
}
names(seu_obj_sub) <- names(seu.list)

save(seu_obj_sub, file = paste0(inter_data, "All_cells_post_detection_based_filtering.RData"))
rm(qc_list)

#detection based QC stats table
pre.qc <- vector("double", length(seu.list))
post.qc <- vector("double", length(seu_obj_sub))
for (i in seq_along(names(seu.list))){
  pre.qc[i] <- ncol(seu.list[[i]])
  post.qc[i] <- ncol(seu_obj_sub[[i]])
}
qc_df <- data.frame(pre.qc, post.qc)
rownames(qc_df) <- names(seu.list)

qc_df$excluded <- qc_df$pre.qc - qc_df$post.qc
qc_df$excluded_pecentage <- paste0(round(qc_df$excluded / qc_df$pre.qc, digits = 4) * 100, "%")

colnames(qc_df) <- c("Pre_QC", "Post_QC", "Low_quality_cells", "Low_quality_cell_pecentage")

save(qc_df, file = paste0(inter_data, "for_plot_qc_stat.RData"))

rm(qc_df, i, pre.qc, post.qc, seu_m)

Detection-based QC stats table

load(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/interdata/analysis_v1/for_plot_qc_stat.RData")

knitr::kable(qc_df,
             align = "c")
Pre_QC Post_QC Low_quality_cells Low_quality_cell_pecentage
Surface_fish_1 3190 2903 287 9%
Surface_fish_2 5900 5154 746 12.64%
Pachon_1 2209 1822 387 17.52%
Pachon_2 4453 3784 669 15.02%
Molino_1 3252 2952 300 9.23%
Molino_2 1421 1352 69 4.86%


Doublet-based QC

We will try a conserved doublet rate (half of the estimated doublet rate).

#run on cerebro: 428631 25 min 10G

library(dplyr) #tidy data and pipe assignment
library(Seurat) #single cell infrastructure
library(zeallot) #pipe assignment
library(magrittr) #pipe assignment
library(DoubletFinder) #find doublets

#set up root paths
root <- "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/"
##root path for intermediate data
inter_data <- paste0(root, "data/interdata/analysis_v1/")
# system(paste0("mkdir -p ", inter_data))
##root path for deliverables
del_root <- paste0(root, "results/deliverable/analysis_v1/")
# system(paste0("mkdir -p ", del_root))
##root path for plots
plot_root <- paste0(root, "results/plots/analysis_v1/")
# system(paste0("mkdir -p ", plot_root))

rm(root)

load(file = paste0(inter_data, "All_cells_post_detection_based_filtering.RData"))

###pre-process Seurat objects
seu_obj_sub_dbpre <- list()
for (i in seq_along(names(seu_obj_sub))){
  seu_obj_sub_dbpre[[i]] <- seu_obj_sub[[i]] %>% 
    SCTransform(vst.flavor = "v2",
                variable.features.n = 3000) %>%
    RunPCA() %>%
    RunUMAP(dims = 1:30) %>%
    FindNeighbors(dims = 1:30) %>%
    FindClusters(resolution = 0.5)
}
save(seu_obj_sub_dbpre, file = paste0(inter_data, "TEMP_seu_obj_sub_dbpre.RData"))#delete later

#load(file = paste0(inter_data, "TEMP_seu_obj_sub_dbpre.RData"))

###pK Identification (no ground-truth)
pK_ident <- function(x) {
  paramSweep(x,
             PCs = 1:30,
             sct = TRUE) %>%
    summarizeSweep(GT = FALSE) %>%
    find.pK()
}

pK <- list()
for (i in seq_along(names(seu_obj_sub))){
  pK[[i]] <- pK_ident(seu_obj_sub_dbpre[[i]])
}

save(pK, file = paste0(inter_data, "TEMP_pK_ident_classic.RData"))#delete later

###find the optimal pK
#load(file = paste0(inter_data, "TEMP_pK_ident_classic.RData"))
max_pK <- function(x){
  arrange(x, desc(BCmetric)) %>%
    summarise(max = dplyr::first(pK))
}

pK_i <- list()
for (i in seq_along(names(seu_obj_sub))){
  pK_i[[i]] <- max_pK(pK[[i]])
}

###doublets number estimation
x <- c(500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000) 
y <- c(.4, .8, 1.6, 2.3, 3.1, 3.9, 4.6, 5.4, 6.1, 6.9, 7.6)
plot(x, y)
equation <- lm(y ~ x)

load(file = paste0(inter_data, "seuratobj.RData"))

seu_obj <- seu.list

####calculate how many doublets we expect in each sample
cell_exp <- list()
for (i in seq_along(names(seu_obj_sub))){
  cell_exp[[i]] <- round(ncol(seu_obj_sub[[i]]) * ((predict(equation, data.frame(x = ncol(seu_obj[[i]])))/2/100)))
}
names(cell_exp) <- names(seu_obj_sub)

save(pK_i, cell_exp, file = paste0(inter_data, "TEMP_for_3_dbf.RData"))#delete later

###doublets finding
###this step was run on cerebro using 3_dbf.R-----------------------------------
for (i in seq_along(names(seu_obj_sub))){
  seu_obj_sub_dbpre[[i]] <- doubletFinder(seu_obj_sub_dbpre[[i]],
                                          PCs = 1:30, pN = 0.25,
                                          pK = as.numeric(as.character(pK_i[[i]][[1]])),
                                          nExp = cell_exp[[i]][[1]],
                                          reuse.pANN = FALSE, sct = TRUE)
}

names(seu_obj_sub_dbpre) <- names(seu_obj_sub)

save(seu_obj_sub_dbpre, file = paste0(inter_data, "CEREBRO_dbf2.RData"))
load(file = paste0(inter_data, "CEREBRO_dbf2.RData"))

###change and rename the column to "DF.classifications" for visualization
DF.class_in_meta <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  DF.class_in_meta[[i]] <- grep("DF.classifications",
                                colnames(seu_obj_sub_dbpre[[i]]@meta.data),
                                value = TRUE)
}

for (i in seq_along(names(seu_obj_sub_dbpre))){
  seu_obj_sub_dbpre[[i]][["DF.classifications"]] <- seu_obj_sub_dbpre[[i]][[DF.class_in_meta[[i]]]]
  seu_obj_sub_dbpre[[i]][[DF.class_in_meta[[i]]]] <- NULL
}

rm(DF.class_in_meta, i)

pdf <- function(x){
  pd <- DimPlot(x, reduction = "umap", group.by = "DF.classifications")
  pc <- DimPlot(x, reduction = "umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) + 
    #scale_colour_manual(values = p67) + 
    labs(caption = as.character(x$sample_name)[1])
  pd + pc
}

db_plots <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  db_plots[[i]] <- pdf(seu_obj_sub_dbpre[[i]])
}

png(filename = paste0(plot_root, "All_cells_db2_UMAP.png"), width = 1000, height = 500*6)
wrap_plots(db_plots, ncol = 1)
dev.off()

rm(db_plots, pdf)



###doublets related visualization-Violin Plot
###(we expect doublets contain more feature number than singlets)
Vln.db <- function(x){
  VlnPlot(
    x,
    features = c("nFeature_RNA"),
    group.by = "DF.classifications"
  ) +
    labs(caption = as.character(x$sample_name)[1])
}

db_plots <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  db_plots[[i]] <- Vln.db(seu_obj_sub_dbpre[[i]])
}

png(filename = paste0(plot_root, "All_cells_db2_Vln.png"), width = 500, height = 500*6)
wrap_plots(db_plots, ncol = 1)
dev.off()

rm(db_plots, Vln.db)



###Remove doublets found by conserved doublet rate
seu_obj_singlets <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  seu_obj_singlets[[i]] <- subset(seu_obj_sub_dbpre[[i]],
                                  subset = DF.classifications == "Singlet")
}
names(seu_obj_singlets) <- names(seu_obj_sub_dbpre)

save(seu_obj_singlets, file = paste0(inter_data, "All_cells_singlets.RData"))
###doublets stat table
pre.db <- vector("double", length(names(seu_obj_sub_dbpre)))
post.db <- vector("double", length(names(seu_obj_singlets)))
for (i in seq_along(names(seu_obj_sub_dbpre))){
  pre.db[i] <- ncol(seu_obj_sub_dbpre[[i]])
  post.db[i] <- ncol(seu_obj_singlets[[i]])
}
db_df <- data.frame(pre.db, post.db)
rownames(db_df) <- names(seu_obj_sub_dbpre)

db_df$doublets <- db_df$pre.db - db_df$post.db
db_df$doublets_percentage <- paste0(round(db_df$doublets / db_df$pre.db, digits = 4) * 100, "%")

save(db_df, file = paste0(inter_data, "for_plot_db_stat.RData"))

rm(i, pre.db, post.db, db_df, seu_obj_sub_dbpre)

Doublet-based QC stats table

load(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/interdata/analysis_v1/for_plot_db_stat.RData")

knitr::kable(db_df,
             align = "c")
pre.db post.db doublets doublets_percentage
Surface_fish_1 2903 2867 36 1.24%
Surface_fish_2 5154 5037 117 2.27%
Pachon_1 1822 1806 16 0.88%
Pachon_2 3784 3719 65 1.72%
Molino_1 2952 2915 37 1.25%
Molino_2 1352 1344 8 0.59%

Normalization and dimensional reduction

load(file = paste0(inter_data, "All_cells_singlets.RData"))

#SCTransform
seu_obj_singlets <- lapply(X = seu_obj_singlets,
                           FUN = function(x){
                             x <- SCTransform(x,
                                              vst.flavor = "v2",
                                              variable.features.n = 3000,
                                              return.only.var.genes = FALSE)
                           })

save(seu_obj_singlets, file = paste0(inter_data, "All_cells_post_SCTransform.RData"))

We will merge fish group wise replicates, then integrate the three fish groups.

load(file = paste0(inter_data, "All_cells_post_SCTransform.RData"))

## ===== sf ===== ##
sf_m <- merge(seu_obj_singlets$Surface_fish_1, y = seu_obj_singlets$Surface_fish_2,
              add.cell.ids = c("Surface_fish_1", "Surface_fish_2"))
sf_m$fish <- "Surface_fish"
sf_m %<>% PrepSCTFindMarkers(assay = "SCT")
VariableFeatures(sf_m) <- c(VariableFeatures(seu_obj_singlets$Surface_fish_1),
                            VariableFeatures(seu_obj_singlets$Surface_fish_2))

## ===== pa ===== ##
pa_m <- merge(seu_obj_singlets$Pachon_1, y = seu_obj_singlets$Pachon_2,
              add.cell.ids = c("Pachon_1", "Pachon_2"))
pa_m$fish <- "Pachon"
pa_m %<>% PrepSCTFindMarkers(assay = "SCT")
VariableFeatures(pa_m) <- c(VariableFeatures(seu_obj_singlets$Pachon_1),
                            VariableFeatures(seu_obj_singlets$Pachon_2))

## ===== mo ===== ##
mo_m <- merge(seu_obj_singlets$Molino_1, y = seu_obj_singlets$Molino_2,
              add.cell.ids = c("Molino_1", "Molino_2"))
mo_m$fish <- "Molino"
mo_m %<>% PrepSCTFindMarkers(assay = "SCT")
VariableFeatures(mo_m) <- c(VariableFeatures(seu_obj_singlets$Molino_1),
                            VariableFeatures(seu_obj_singlets$Molino_2))

save(sf_m, pa_m, mo_m, file = paste0(inter_data, "All_fish_merged.RData"))

rm(seu_obj_singlets)

library(harmony)
#integration using Harmony
all_merge <- list(sf_m, pa_m, mo_m)
names(all_merge) <- c("Surface_fish", "Pachon", "Molino")

var_features <- SelectIntegrationFeatures(object.list = all_merge,
                                          nfeatures = 3000)

seu_m <- merge(all_merge[[1]], y = all_merge[2:length(all_merge)],
               merge.data = TRUE)

VariableFeatures(seu_m) <- var_features
seu_m %<>%
  RunPCA() %>%
  RunHarmony(assay.use = "SCT",
             group.by.vars = "fish")

save(seu_m, file = paste0(inter_data, "all_fish_harmonied.RData"))

#elbow plot
p <- ElbowPlot(seu_m, ndims = 50) +
  ggtitle(label = "Elbowplot of PCs")

png(filename = paste0(plot_root, "all_fish_harmonied_PCA.png"), width = 800, height = 300)
plot(p)
dev.off()

rm(p, var_features, all_merge, sf_m, pa_m, mo_m)



PC 40 resolution 0.5

Based on the elbow plot, we include the first PCs in the downstream analysis.

UMAP and integration check

#pc40 res0.5
set.seed(10086)

i <- 40
j <- 0.5

seu_m.c <- seu_m %>%
  FindNeighbors(dims = 1:i, reduction = "harmony") %>%
  FindClusters(resolution = j) %>%
  RunUMAP(dims = 1:i, reduction = "harmony")

curio26 <- readRDS("~/color/curio26.rds")
ph <- DimPlot(seu_m.c, label = TRUE, repel = TRUE, raster = FALSE, cols = curio26) +
  NoLegend() +
  coord_fixed()

png(filename = paste0(plot_root, "all_fish_harmonied_PC40res0.5.png"), width = 800, height = 600)
ph
dev.off()

rm(ph)



ps <- DimPlot(seu_m.c, group.by = "fish") +
  coord_fixed()

png(filename = paste0(plot_root, "all_fish_harmonied_PC40res0.5_fish.png"), width = 800, height = 600)
ps
dev.off()

rm(ps)



ps <- DimPlot(seu_m.c, split.by = "fish", group.by = "seurat_clusters", cols = curio26, label = TRUE, pt.size = .6) +
  coord_fixed()

png(filename = paste0(plot_root, "all_fish_harmonied_PC40res0.5_fish_cluster.png"), width = 600*3, height = 600)
ps
dev.off()

rm(ps)



#sample wise UMAP
p <-
  DimPlot(
    seu_m.c,
    group.by = "seurat_clusters",
    split.by = "sample_name",
    cols = curio26,
    label = TRUE,
    pt.size = 1,
    ncol = 2
  ) +
  coord_fixed()

png(filename = paste0(plot_root, "all_fish_harmonied_PC40res0.5_sample_cluster.png"), width = 600*2, height = 600*2.6)
p
dev.off()

rm(p)



Exploration of known markers

seu_m.c %<>% PrepSCTFindMarkers(assay = "SCT")

save(seu_m.c, file = paste0(inter_data, "all_fish_harmonied_post_SCTmarkers_PC40res.5.RData"))

pf <- scCustomize::FeaturePlot_scCustom(seu_m.c,
                                        features = c("ddx4",
                                                     "gsdf",
                                                     "cyp11a1.1",
                                                     "col1a1a",
                                                     "mpx",
                                                     "MPEG1.1", 
                                                     #"nkl.2", not found
                                                     "fli1"),
                                        num_columns = 4
                                        ) &
  coord_fixed() &
  NoAxes()

png(filename = paste0(plot_root, "all_fish_harmonied_PC40res.5_zf_markers.png"), width = 500*4*.8, height = 450*2*.8)
pf
dev.off()



Integration with zebrafish data

To help with cell type annotation, we will integrate the data with public zebrafish data.

Zebrafish paper seurat object download and processing

## ===== download their seurat obj (their descriptions are not very clear) ===== ##
library(rvest)

url_link <- "https://datadryad.org/stash/share/CEd0Zs4oZKdinTWeJPKbWYjBq6hYq4QhVacQcFjf37E"

files_avail <- read_html(url_link) %>%
  html_nodes("a") 

to_download <- grep(pattern = "final",
                    x = files_avail,
                    value = TRUE)

hrefs <- vector("character", length(to_download))
titles <- vector("character", length(to_download))

#extract hrefs and titles
for (i in seq_along(to_download)) {
  element <- read_html(to_download[i])

  hrefs[i] <- html_attr(html_node(element, "a"), "href")
  
  titles[i] <- html_attr(html_node(element, "a"), "title")
}

save_path <- "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/seu_obj/"
system(paste0("mkdir -p ", save_path))

for (i in seq_along(to_download)){
 system(paste0("wget -O ", save_path, titles[i], " https://datadryad.org", hrefs[i])) 
}

Raw data (fastq) download and preprocessing

Download fastq files from GEO

#had issue installing GEOquery, we will just get the SRR IDs mannually since we only have three samples
#40 day post-fertilization zebrafish ovaries cells, sorted germ cells: SRR17262952
#40 day post-fertilization zebrafish ovaries cells (zx2): SRR17262951   
#40 day post-fertilization zebrafish ovaries cells (zx4): SRR17262950   
module load sratoolkit/3.0.1

prefetch -X 200G SRR17262950
fasterq-dump SRR17262950

Cell Ranger Count

#need to rename files from SRRxxxxx_1.fastq to SRRxxxxx_S1_L001_R1_001.fastq

#make a sample sheet for array jobs
ss <- data.frame(sample = c("SRR17262950", "SRR17262951", "SRR17262952"))

write.table(ss,
            sep = " ",
            file = paste0("/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellranger.txt"),
            quote = FALSE,
            row.names = FALSE, col.names = FALSE)

rm(ss)
#!/bin/bash
#SBATCH --time=168:00:00
#SBATCH --mem=200G
#SBATCH --job-name=cellranger_count
#SBATCH --cpus-per-task=32
#SBATCH --array=1-3
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err

sample=$(cat /n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellranger.txt|\
         awk -v var=$SLURM_ARRAY_TASK_ID 'NR==var {print $1}')

module load cellranger/7.1.0

cd /n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/fastq/${sample}

cellranger count --id=${sample} \
                 --transcriptome=/n/analysis/genomes/Danio_rerio/GRCz11_primary_assembly/annotation/Ens_110/10x/cellranger-7.1.0 \
                 --fastqs=/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/fastq/${sample} \
                 --sample=${sample} \
                 --localcores=32 \
                 --expect-cells=10000

##first round no expect cells
##3hrs 32 cores ~140G each
##report _noexpcells

##second round added expect cells according to the author GEO description
##400144
#only the first one succeeded, the other two failed
#I remember yesterday, the first one also failed, maybe I will just try resubmitting the other two
#400250_2 400250_3

Note that in the paper, the option –expect-cells was set to 10000, we will stick to that.


QC

Ambient RNA correction with CellBender 0.3.0

#create a sample sheet for array job with path to output
ss <- data.frame(sample = c("SRR17262950", "SRR17262951", "SRR17262952"),
                 path = c("/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/fastq/SRR17262950/SRR17262950/outs",
                          "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/fastq/SRR17262951/SRR17262951/outs",
                          "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/fastq/SRR17262952/SRR17262952/outs"))

write.table(ss,
            sep = " ",
            file = paste0("/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellbender_pub.txt"),
            quote = FALSE,
            row.names = FALSE, col.names = FALSE)
#!/bin/bash
#SBATCH --partition gpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1 
#SBATCH --ntasks=1 
#SBATCH --mem=100GB
#SBATCH --job-name=cellbender
#SBATCH --array=1-3
#SBATCH --output=array_job_%A_%a.out
#SBATCH --error=array_job_%A_%a.err

. ~/anaconda3/etc/profile.d/conda.sh

conda activate cellbender

paths=$(cat /n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/sample_sheet_cellbender_pub.txt|\
         awk -v var=$SLURM_ARRAY_TASK_ID 'NR==var {print $2}')

cd ${paths}

cellbender remove-background \
      --cuda \
      --input raw_feature_bc_matrix.h5 \
      --output output.h5

ptrepack --complevel 5 output_filtered.h5:/matrix output_filtered_seurat.h5:/matrix


Detection-based quality control (QC)

#read in post-cell bender data
paths <- "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/fastq/"
h5_paths <- list.files(paths, pattern = "output_filtered_seurat.h5", full.names = TRUE, recursive = TRUE)

names(h5_paths) <- c("zx4", "zx2", "sorted_germ_cells")

seu.list <- list()
for (i in seq_along(names(h5_paths))){
  data.list <- Read10X_h5(filename = h5_paths[i],
                          use.names = TRUE)

  seu.list[[i]] <- CreateSeuratObject(counts = data.list,
                                      project = names(h5_paths)[i])
  seu.list[[i]]$sample_name <- names(h5_paths)
}
names(seu.list) <- names(h5_paths)

#use nCount>500 as real cells like 10x did to keep real cells only
seu.list <- lapply(X = seu.list,
                   FUN = function(x){
                     x <- subset(x, subset = nCount_RNA > 500)
                   })

rm(data.list, paths, h5_paths, i)

save(seu.list, file = paste0(inter_data, "zf_seuratobj.RData"))

QC visualization

Visualization of nFeature, nCount, percentage of mitochondrial genes and ribosomal genes
#QC
##calculate percentage of mitochondrial genes and ribosomal genes
##Ribosomal genes also tend to be very highly represented, and can vary between cell types, so it can be instructive to see how prevalent they are in the data.
##These are ribosomal protein genes rather than the actual rRNA, so they are more a measure of the translational activity of the cell rather than the cleanliness of the polyA selection.

for (i in seq_along(names(seu.list))){
  seu.list[[i]][["percent.mt"]] <-
    PercentageFeatureSet(seu.list[[i]], pattern = "^mt-")
  seu.list[[i]][["percent.ribo"]] <-
    PercentageFeatureSet(seu.list[[i]], pattern = "^rp[l|s]")
}

#plot QC
##merge for visualization
seu_m <- merge(seu.list[[1]], y = seu.list[2:length(seu.list)],
               add.cell.ids = names(seu.list))

pv1 <- VlnPlot(seu_m,
               features = c("nFeature_RNA"),
               pt.size = 0,
               cols = nord::nord_palettes$aurora,
               ncol = 1) +
  NoLegend() +
  xlab(label = "")
pv2 <- VlnPlot(seu_m,
               features = c("nCount_RNA"),
               pt.size = 0,
               cols = nord::nord_palettes$aurora,
               y.max = 150000,
               ncol = 1) +
  NoLegend() +
  xlab(label = "")
pv3 <- VlnPlot(seu_m,
               features = c("percent.mt"),
               pt.size = 0,
               cols = nord::nord_palettes$aurora,
               y.max = 100,
               ncol = 1) +
  NoLegend() +
  xlab(label = "")
pv4 <- VlnPlot(seu_m,
               features = c("percent.ribo"),
               pt.size = 0,
               cols = nord::nord_palettes$aurora,
               y.max = 100,
               ncol = 1) +
  NoLegend() +
  xlab(label = "")

png(filename = paste0(plot_root, "zf_qc.png"), width = 800, height = 800)
wrap_plots(pv1, pv3, pv2, pv4, ncol = 2)
dev.off()

rm(pv1, pv2, pv3, pv4)



Visualization of feature and count coorelation
seu_obj <- seu.list

plot_list <- list()
for (i in seq_along(names(seu.list))){
  plot_list[[i]] <-
    FeatureScatter(seu_obj[[i]], feature1 = "nCount_RNA", feature2 = "percent.mt")  + NoLegend() +
    FeatureScatter(seu_obj[[i]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA")  + NoLegend() +
    FeatureScatter(seu_obj[[i]], feature1 = "percent.ribo", feature2 = "percent.mt") + NoLegend()
  plot_list[[i]] <- plot_list[[i]] + labs(caption = names(seu.list)[i])
}

png(filename = paste0(plot_root, "zf_sca.png"), width = 300*3, height = 300*length(seu.list))
wrap_plots(plot_list, ncol = 1)
dev.off()

rm(plot_list, i, seu_obj)



Detection-based filtering

#from the paper:
#For the sorted germ cell library, we retained cells with 200–6000 genes, less than 150,000 unique transcripts, and less than 5% mitochondrial transcripts. For whole ovary libraries, we retained the cells with 200–8000 genes, less than 200,000 unique transcripts, and less than 20% mitochondrial transcripts. These cutoffs were determined by the gene and transcript distributions of those libraries. Blood cells and a small number of germ cells were also removed from the whole ovary libraries. DoubletFinder was run using a conservative 5% estimated cell doublet cutoff 

##detection based filtering
###calculate mean+3*standard deviation as the high cutoff value
qc_list <- vector(mode = "integer",
                  length = length(seu.list))
for (i in seq_along(names(seu.list))){
  qc_list[i] <- round(mean(seu.list[[i]]$nFeature_RNA) + 3 * sd(seu.list[[i]]$nFeature_RNA),
                      digits = 0)
}

# qc_list
# [1] 6076 6176 9645

#we will start with our own cutoffs but their mitochondiral cutoff
mito_cutoff <- c(20, 20, 5)

###filter genes according to both detection and mito percentage
seu_obj_sub <- list()
for (i in seq_along(names(seu.list))){
  seu_obj_sub[[i]] <- subset(seu.list[[i]],
                              nFeature_RNA > 300 & nFeature_RNA < qc_list[i] & percent.mt < mito_cutoff[i])
}
names(seu_obj_sub) <- names(seu.list)

save(seu_obj_sub, file = paste0(inter_data, "zf_post_detection_based_filtering.RData"))
rm(qc_list, mito_cutoff)

#detection based QC stats table
pre.qc <- vector("double", length(seu.list))
post.qc <- vector("double", length(seu_obj_sub))
for (i in seq_along(names(seu.list))){
  pre.qc[i] <- ncol(seu.list[[i]])
  post.qc[i] <- ncol(seu_obj_sub[[i]])
}
qc_df <- data.frame(pre.qc, post.qc)
rownames(qc_df) <- names(seu.list)

qc_df$excluded <- qc_df$pre.qc - qc_df$post.qc
qc_df$excluded_pecentage <- paste0(round(qc_df$excluded / qc_df$pre.qc, digits = 4) * 100, "%")

colnames(qc_df) <- c("Pre_QC", "Post_QC", "Low_quality_cells", "Low_quality_cell_pecentage")

save(qc_df, file = paste0(inter_data, "for_plot_qc_stat_zf.RData"))

rm(qc_df, i, pre.qc, post.qc, seu_m)

Detection-based QC stats table

load(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/interdata/analysis_v1/for_plot_qc_stat_zf.RData")

knitr::kable(qc_df,
             align = "c")
Pre_QC Post_QC Low_quality_cells Low_quality_cell_pecentage
zx4 12084 9611 2473 20.47%
zx2 11489 8992 2497 21.73%
sorted_germ_cells 11989 11341 648 5.4%

Doublet-based QC

We will use a conserved doublet rate as the paper did (the paper used 5%).

#run on cerebro: 409729  10 min 23G

library(dplyr) #tidy data and pipe assignment
library(Seurat) #single cell infrastructure
library(zeallot) #pipe assignment
library(magrittr) #pipe assignment
library(DoubletFinder) #find doublets

#set up root paths
root <- "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/"
##root path for intermediate data
inter_data <- paste0(root, "data/interdata/analysis_v1/")
# system(paste0("mkdir -p ", inter_data))
##root path for deliverables
del_root <- paste0(root, "results/deliverable/analysis_v1/")
# system(paste0("mkdir -p ", del_root))
##root path for plots
plot_root <- paste0(root, "results/plots/analysis_v1/")
# system(paste0("mkdir -p ", plot_root))

rm(root)

load(file = paste0(inter_data, "zf_post_detection_based_filtering.RData"))

# ###pre-process Seurat objects
# seu_obj_sub_dbpre <- list()
# for (i in seq_along(names(seu_obj_sub))){
#   seu_obj_sub_dbpre[[i]] <- seu_obj_sub[[i]] %>% 
#     SCTransform(vst.flavor = "v2",
#                 variable.features.n = 3000) %>%
#     RunPCA() %>%
#     RunUMAP(dims = 1:30) %>%
#     FindNeighbors(dims = 1:30) %>%
#     FindClusters(resolution = 0.5)
# }
# save(seu_obj_sub_dbpre, file = paste0(inter_data, "TEMP_seu_obj_sub_dbpre.RData"))#delete later

load(file = paste0(inter_data, "TEMP_seu_obj_sub_dbpre.RData"))

# ###pK Identification (no ground-truth)
# pK_ident <- function(x) {
#   paramSweep(x,
#              PCs = 1:30,
#              sct = TRUE) %>%
#     summarizeSweep(GT = FALSE) %>%
#     find.pK()
# }
# 
# pK <- list()
# for (i in seq_along(names(seu_obj_sub))){
#   pK[[i]] <- pK_ident(seu_obj_sub_dbpre[[i]])
# }
# 
# save(pK, file = paste0(inter_data, "TEMP_pK_ident_classic.RData"))#delete later

###find the optimal pK
load(file = paste0(inter_data, "TEMP_pK_ident_classic.RData"))
max_pK <- function(x){
  arrange(x, desc(BCmetric)) %>%
    summarise(max = dplyr::first(pK))
}

pK_i <- list()
for (i in seq_along(names(seu_obj_sub))){
  pK_i[[i]] <- max_pK(pK[[i]])
}

# ###doublets number estimation
# x <- c(500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000) 
# y <- c(.4, .8, 1.6, 2.3, 3.1, 3.9, 4.6, 5.4, 6.1, 6.9, 7.6)
# plot(x, y)
# equation <- lm(y ~ x)
# 
# load(file = paste0(inter_data, "zf_seuratobj.RData"))
# 
# seu_obj <- seu.list

####calculate how many doublets we expect in each sample
cell_exp <- list()
for (i in seq_along(names(seu_obj_sub))){
  cell_exp[[i]] <- round(ncol(seu_obj_sub[[i]]) * 0.05)
}
names(cell_exp) <- names(seu_obj_sub)

# save(pK_i, cell_exp, file = paste0(inter_data, "TEMP_for_3_dbf.RData"))#delete later

###doublets finding
###this step was run on cerebro using 3_dbf.R-----------------------------------
for (i in seq_along(names(seu_obj_sub))){
  seu_obj_sub_dbpre[[i]] <- doubletFinder(seu_obj_sub_dbpre[[i]],
                                          PCs = 1:30, pN = 0.25,
                                          pK = as.numeric(as.character(pK_i[[i]][[1]])),
                                          nExp = cell_exp[[i]][[1]],
                                          reuse.pANN = FALSE, sct = TRUE)
}

names(seu_obj_sub_dbpre) <- names(seu_obj_sub)

save(seu_obj_sub_dbpre, file = paste0(inter_data, "CEREBRO_dbf_zf_0.05.RData"))
load(file = paste0(inter_data, "CEREBRO_dbf_zf_0.05.RData"))

###change and rename the column to "DF.classifications" for visualization
DF.class_in_meta <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  DF.class_in_meta[[i]] <- grep("DF.classifications",
                                colnames(seu_obj_sub_dbpre[[i]]@meta.data),
                                value = TRUE)
}

for (i in seq_along(names(seu_obj_sub_dbpre))){
  seu_obj_sub_dbpre[[i]][["DF.classifications"]] <- seu_obj_sub_dbpre[[i]][[DF.class_in_meta[[i]]]]
  seu_obj_sub_dbpre[[i]][[DF.class_in_meta[[i]]]] <- NULL
}

rm(DF.class_in_meta, i)

pdf <- function(x){
  pd <- DimPlot(x, reduction = "umap", group.by = "DF.classifications")
  pc <- DimPlot(x, reduction = "umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) + 
    #scale_colour_manual(values = p67) + 
    labs(caption = as.character(x$sample_name)[1])
  pd + pc
}

db_plots <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  db_plots[[i]] <- pdf(seu_obj_sub_dbpre[[i]])
}

png(filename = paste0(plot_root, "zf_db2_UMAP.png"), width = 1000, height = 500*3)
wrap_plots(db_plots, ncol = 1)
dev.off()

rm(db_plots, pdf)



###doublets related visualization-Violin Plot
###(we expect doublets contain more feature number than singlets)
Vln.db <- function(x){
  VlnPlot(
    x,
    features = c("nFeature_RNA"),
    group.by = "DF.classifications"
  ) +
    labs(caption = as.character(x$sample_name)[1])
}

db_plots <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  db_plots[[i]] <- Vln.db(seu_obj_sub_dbpre[[i]])
}

png(filename = paste0(plot_root, "zf_db2_Vln.png"), width = 500, height = 500*3)
wrap_plots(db_plots, ncol = 1)
dev.off()

rm(db_plots, Vln.db)



###Remove doublets
seu_obj_singlets <- list()
for (i in seq_along(names(seu_obj_sub_dbpre))){
  seu_obj_singlets[[i]] <- subset(seu_obj_sub_dbpre[[i]],
                                  subset = DF.classifications == "Singlet")
}
names(seu_obj_singlets) <- names(seu_obj_sub_dbpre)

save(seu_obj_singlets, file = paste0(inter_data, "zf_singlets.RData"))

Doublet-based QC stats table

###doublets stat table
pre.db <- vector("double", length(names(seu_obj_sub_dbpre)))
post.db <- vector("double", length(names(seu_obj_singlets)))
for (i in seq_along(names(seu_obj_sub_dbpre))){
  pre.db[i] <- ncol(seu_obj_sub_dbpre[[i]])
  post.db[i] <- ncol(seu_obj_singlets[[i]])
}
db_df <- data.frame(pre.db, post.db)
rownames(db_df) <- names(seu_obj_sub_dbpre)

db_df$doublets <- db_df$pre.db - db_df$post.db
db_df$doublets_percentage <- paste0(round(db_df$doublets / db_df$pre.db, digits = 4) * 100, "%")

save(db_df, file = paste0(inter_data, "for_plot_db2_stat.RData"))

rm(i, pre.db, post.db, db_df, seu_obj_sub_dbpre)
load(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/interdata/analysis_v1/for_plot_db2_stat.RData")

knitr::kable(db_df,
             align = "c")
pre.db post.db doublets doublets_percentage
zx4 9611 9130 481 5%
zx2 8992 8542 450 5%
sorted_germ_cells 11341 10774 567 5%


Merge, normalization and dimensional reduction

#didn't get similar UMAP, figured that the author merged the data and did not integrate

load(file = paste0(inter_data, "zf_singlets.RData"))

merged <- merge(seu_obj_singlets[[1]], y = seu_obj_singlets[2:length(seu_obj_singlets)],
                add.cell.ids = names(seu_obj_singlets))

# merged[["RNA"]] <- split(merged[["RNA"]],
#                          f = merged$sample_name)

merged %<>% SCTransform(vst.flavor = "v2",
                           variable.features.n = 3000,
                           return.only.var.genes = FALSE)


#prep for DE analysis and visualization
merged %<>% PrepSCTFindMarkers(assay = "SCT")

#variable features
VariableFeatures(merged) <- c(
  VariableFeatures(seu_obj_singlets$sorted_germ_cells),
  VariableFeatures(seu_obj_singlets$zx2),
  VariableFeatures(seu_obj_singlets$zx4)
)

merged %<>% RunPCA()

save(merged, file = paste0(inter_data, "zf_merged_sctpreped.RData"))

load(file = paste0(inter_data, "zf_merged_sctpreped.RData"))

#above was ran on cerebro interactive session

##elbowplot
p <- ElbowPlot(merged, ndims = 50) +
  ggtitle("Elbow plot of PCs")

png(filename = paste0(plot_root, "zf_merged_elbow.png"), width = 800, height = 300)
p
dev.off()

rm(p)



22 PCs were included for the following analysis to be consistent with the paper.

set.seed(10086)

i = 22
j = .2

merged.c <- merged %>%
  FindNeighbors(dims = 1:i) %>%
  FindClusters(resolution = j) %>%
  RunUMAP(dims = 1:i)

save(merged.c, file = paste0(inter_data, "zf_merged_PC22res0.2.RData"))

#color35 <- readRDS("~/color/color35.rds")
#color19_nord <- readRDS("~/color/color19_nord.rds")

p <- DimPlot(merged.c,
             #cols = color35,
             label = TRUE,
             repel = TRUE,
             label.box = TRUE,
             pt.size = .2) +
  coord_fixed(ratio = 1)

png(filename = paste0(plot_root, "zf_merged_PC20res0.2.png"), width = 800, height = 600)
p
dev.off()

rm(p)



Label transfer

This step transfers cell annotation labels from the seurat object the authors generated to our merged object.

#label transfer

## ===== reference ===== ##
load(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/pub_data/seu_obj/zx124_40com_final_orig.robj")
#update seurat obj
zx124_40com_final_orig <- UpdateSeuratObject(zx124_40com_final_orig)
zx124_40com_final_orig$annotation <- zx124_40com_final_orig@active.ident

zf_ref <- zx124_40com_final_orig
rm(zx124_40com_final_orig)

pref <- DimPlot(zf_ref, label.box = TRUE, label = TRUE) +
  NoLegend() +
  ggtitle("Cell type annotation from the author") +
  coord_fixed()

png(filename = paste0(plot_root, "zf_ref.png"), width = 800, height = 600)
pref
dev.off()

rm(pref)



We may not need such a high resolution of cell types without actual meaning, we will combine the subclusters.


zf_ref_meta <- zf_ref@meta.data
zf_ref_meta$annotation_sm <- zf_ref_meta$annotation
zf_ref_meta$annotation_sm <- gsub("Unknown_1|Unknown_2", "Unknown", zf_ref_meta$annotation_sm)
zf_ref_meta$annotation_sm <- gsub("^Follicle.*", "Follicle", zf_ref_meta$annotation_sm)
zf_ref_meta$annotation_sm <- gsub("^Stromal.*", "Stromal", zf_ref_meta$annotation_sm)
zf_ref_meta$annotation_sm <- gsub("^Early.*|^Late.*|^Meio|^GSC.*", "Germ_cells", zf_ref_meta$annotation_sm)

#double checked the number of cells looks right

to_add <- zf_ref_meta$annotation_sm
names(to_add) <- rownames(zf_ref_meta)

zf_ref$annotation_sm <- to_add

save(zf_ref, file = paste0(inter_data, "zf_ref.RData"))

psm <- DimPlot(zf_ref, label.box = TRUE, label = TRUE, group.by = "annotation_sm") +
  NoLegend() +
  ggtitle("Cell type annotation in paper") +
  coord_fixed()

png(filename = paste0(plot_root, "zf_ref_sm.png"), width = 800, height = 600)
psm
dev.off()

rm(psm)
rm(to_add, zf_ref_meta)



#query: merged.c
#reference: zf_ref

load(file = paste0(inter_data, "zf_ref.RData"))

DefaultAssay(zf_ref) <- "RNA"
zf_ref %<>% NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData()

load(file = paste0(inter_data, "zf_merged_PC22res0.2.RData"))
DefaultAssay(merged.c) <- "RNA"
merged.c %<>% NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData()

anchors <- FindTransferAnchors(reference = zf_ref,
                               query = merged.c,
                               reference.reduction = "pca",
                               dims = 1:22)
predictions <- TransferData(anchorset = anchors,
                            refdata = zf_ref$annotation_sm,
                            dims = 1:22)
merged.c <- AddMetaData(merged.c, metadata = predictions)
rm(anchors)

save(merged.c, file = paste0(inter_data, "zf_merged_PC22res0.2.RData"))

p <- DimPlot(merged.c,
             label = TRUE,
             repel = TRUE,
             label.box = TRUE,
             pt.size = .2,
             group.by = "predicted.id") +
  coord_fixed(ratio = 1)
png(filename = paste0(plot_root, "zf_merged_PC22res0.2_pred.png"), width = 800, height = 600)
p
dev.off()

rm(p)

#FindTransferAnchors error: Normalizing query using reference SCT model
# Warning: No layers found matching search pattern provided
# Projecting cell embeddings
# Error in h(simpleError(msg, call)) : 
#   error in evaluating the argument 'x' in selecting a method for function 't': argument is of length zero



Take a closer look at cluster 11

DefaultAssay(merged.c) <- "SCT"

pf <- FeaturePlot(merged.c,
                  features = c("prediction.score.Follicle", "prediction.score.Germ_cells")) &
  coord_fixed()

png(filename = paste0(plot_root, "zf_merged_PC22res0.2_pred_feat.png"), width = 800*1.6, height = 600)
pf
dev.off()

rm(pf)



#marker exploration
DefaultAssay(merged.c) <- "SCT"

pf <- FeaturePlot(merged.c,
                  features = c("gsdf", "ddx4")) &
  coord_fixed()

png(filename = paste0(plot_root, "zf_merged_PC22res0.2_pred_mar.png"), width = 800*1.6, height = 600)
pf
dev.off()

rm(pf)



p1 <- DimPlot(zf_ref,
              group.by = "orig.ident") +
  coord_fixed()

p2 <- DimPlot(merged.c,
              group.by = "orig.ident") +
  coord_fixed()

png(filename = paste0(plot_root, "zf_ref_orig.png"), width = 800*2, height = 600)
wrap_plots(list(p1, p2), ncol = 2)
dev.off()

rm(p1, p2)



For ovary samples, the authors removed germ cells according to the ddx4 expression, and oocytes according to zp3 expression. We would expect to see germ cells here (potentially cluster 11 here).


#zp3 expression

p <- FeaturePlot(merged.c,
                 features = c("zp3")) +
  coord_fixed()

png(filename = paste0(plot_root, "zf_merged_PC22res0.2_zp3.png"), width = 800, height = 600)
p
dev.off()

rm(p)



Fanning pointed out zp3 expression makes sense, so we will keep cluster 11 in SAMap analysis.

Integration by using SAMap

#!/bin/bash
#SBATCH --time=168:00:00
#SBATCH --mem=1300G
#SBATCH --partition=bigmem
#SBATCH --job-name=map_genes
#SBATCH --cpus-per-task=98
#SBATCH --error=job.%J.err
#SBATCH --output=job.%J.out

module load blast-plus/2.13.0

/n/core/Bioinformatics/analysis/Yu/lim/cbio.lim.102/code/R_scripts_dw2733/SAMap_related/map_genes.sh --tr1 /n/analysis/genomes/Danio_rerio/GRCz11_primary_assembly/annotation/Ens_110/RSEM/GRCz11.Ens_110.RSEM.transcripts.fa --t1 nucl --n1 dr --tr2 /n/analysis/genomes/Astyanax_mexicanus/Astyanax_mexicanus-2.0/annotation/Ens_110/RSEM/Astyanax_mexicanus-2.0.Ens_110.RSEM.transcripts.fa --t2 nucl --n2 am --threads 98

#402256: 4G 6hrs, did not need to use bigmem
## ===== add gene identity to tblastx results ===== ##
#read in transcript file with transcript id to gene id
dr <- read.delim(file = "/n/analysis/genomes/Danio_rerio/GRCz11_primary_assembly/annotation/Ens_110/tables/GRCz11.Ens_110.transcript_data.txt",
                 sep = "\t")
dr_keep <- dr %>%
  select(Transcript_ID, Gene_ID)
rm(dr)

am <- read.delim(file = "/n/analysis/genomes/Astyanax_mexicanus/Astyanax_mexicanus-2.0/annotation/Ens_110/tables/Astyanax_mexicanus-2.0.Ens_110.transcript_data.txt",
                 sep = "\t")
am_keep <- am %>%
  select(Transcript_ID, Gene_ID)
rm(am)

#read in tblastx results
paths <- list.files(path = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/maps_orig/dram", full.names = TRUE)

##am to dr
am_to_dr <- read.table(file = paths[1], sep = "\t", header = FALSE)
colnames(am_to_dr)[1:2] <- c("am_t", "dr_t")

am_to_dr_mu <- am_to_dr %>%
  mutate(am_g = am_keep$Gene_ID[match(am_t, am_keep$Transcript_ID)],
         dr_g = dr_keep$Gene_ID[match(dr_t, dr_keep$Transcript_ID)])

#after replacing the transcript IDs with gene IDs, does the bit score of the same pair change due to multi-mapping between transcripts and gene
am_to_dr_to_save <- am_to_dr_mu %>%
  select(am_g, dr_g, V3, V4,V5, V6, V6, V7, V8, V9, V10, V11, V12)

##check if rows with duplicated am_g and dr_g have the same value in V12
dup_res <- am_to_dr_to_save %>%
  group_by(am_g, dr_g) %>%
  summarize(n = n(),
            same = n_distinct(V12) == 1) #%>%
 # filter(n > 1, same == FALSE)

##we do have duplicates, but SAMap handles that:
# When a pair of genes share multiple High Scoring Pairs (HSPs), which are local regions of matching sequences, we use the HSP with the highest bit score to measure homology. Only pairs with E-value <10−6 are included in the graph.

write.table(am_to_dr_to_save,
            file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/maps/dram/am_to_dr.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = FALSE)

##dr to am
dr_to_am <- read.table(file = paths[2], sep = "\t", header = FALSE)
colnames(dr_to_am)[1:2] <- c("dr_t", "am_t")

dr_to_am_mu <- dr_to_am %>%
  mutate(am_g = am_keep$Gene_ID[match(am_t, am_keep$Transcript_ID)],
         dr_g = dr_keep$Gene_ID[match(dr_t, dr_keep$Transcript_ID)])

#after replacing the transcript IDs with gene IDs, does the bit score of the same pair change due to multi-mapping between transcripts and gene
dr_to_am_to_save <- dr_to_am_mu %>%
  select(dr_g, am_g, V3, V4,V5, V6, V6, V7, V8, V9, V10, V11, V12)

write.table(dr_to_am_to_save,
            file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/maps/dram/dr_to_am.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = FALSE)


## ===== reciprocal best hits ===== ##
##am_to_dr_mu_to_save and dr_to_am_to_save will be the input for SAMap, we will filter the results with E-value (V11) < 0.001, bitscore (V12) > 100, length (V4) > 40, then define best hit as the gene pair with the highest bit score, then the mutual gene pairs are used to construct reciprocal best hits

##am to dr
#read in file
am_to_dr_to_save <- read.table(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/maps/dram/am_to_dr.txt", sep = "\t", header = FALSE)
#E-value, length, bitscore cutoff
am_to_dr_k <- am_to_dr_to_save %>%
  filter(V11 < 0.001 & V12 > 100 & V4 > 40)
#keep the one with highest bit score
am_to_dr_k_b <- am_to_dr_k %>%
  group_by(V1) %>%
  filter(V12 == max(V12))
#keep distinct rows in terms of gene pairs * distinct
am_to_dr_k_b_d <- am_to_dr_k_b %>%
  distinct(V1, V2, .keep_all = TRUE)
am_to_dr_k_b_d$to_match <- paste0(am_to_dr_k_b_d$V1, "_", am_to_dr_k_b_d$V2)
rm(am_to_dr_to_save, am_to_dr_k, am_to_dr_k_b)

##dr to am
#read in file
dr_to_am_to_save <- read.table(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/code/CEREBRO_related/maps/dram/dr_to_am.txt", sep = "\t", header = FALSE)
#E-value cutoff
dr_to_am_k <- dr_to_am_to_save %>%
  filter(V11 < 0.001 & V12 > 100 & V4 > 40)
#keep the one with highest bit score
dr_to_am_k_b <- dr_to_am_k %>%
  group_by(V1) %>%
  filter(V12 == max(V12))
#keep distinct rows in terms of gene pairs * distinct
dr_to_am_k_b_d <- dr_to_am_k_b %>%
  distinct(V1, V2, .keep_all = TRUE)
dr_to_am_k_b_d$to_match <- paste0(dr_to_am_k_b_d$V2, "_", dr_to_am_k_b_d$V1)
rm(dr_to_am_to_save, dr_to_am_k, dr_to_am_k_b)

to_keep <- intersect(am_to_dr_k_b_d$to_match, dr_to_am_k_b_d$to_match)
am_to_dr_to_keep <- am_to_dr_k_b_d %>%
  filter(to_match %in% to_keep)
dr_to_am_to_keep <- dr_to_am_k_b_d %>%
  filter(to_match %in% to_keep)

rm(am_to_dr_k_b_d, dr_to_am_k_b_d, to_keep)

final_rbh <- am_to_dr_to_keep %>%
  select(V1, V2)
colnames(final_rbh) <- c("am_gene_ID", "dr_gene_ID")

#add gene symbols
dr_gene_data <- read.delim(file = "/n/analysis/genomes/Danio_rerio/GRCz11/annotation/Ens_110/tables/GRCz11.Ens_110.gene_data.txt",
                           sep = "\t")
##replace the row with empty gene symbol with gene ID in first column
dr_gene_data$Name <- ifelse(dr_gene_data$Name == "", dr_gene_data$Gene_ID, dr_gene_data$Name)

final_rbh_j <- left_join(x = final_rbh, 
                         y = dr_gene_data %>%
                           select(Gene_ID, Name), 
                         by = c("dr_gene_ID" = "Gene_ID"))
colnames(final_rbh_j)[3] <- "dr_gene_symbol"

am_gene_data <- read.delim(file = "/n/analysis/genomes/Astyanax_mexicanus/Astyanax_mexicanus-2.0/annotation/Ens_110/tables/Astyanax_mexicanus-2.0.Ens_110.gene_data.txt",
                           sep = "\t")
##replace the row with empty gene symbol with gene ID in first column
am_gene_data$Name <- ifelse(am_gene_data$Name == "", am_gene_data$Gene_ID, am_gene_data$Name)

final_rbh_jj <- left_join(x = final_rbh_j,
                          y = am_gene_data %>%
                            select(Gene_ID, Name),
                          by = c("am_gene_ID" = "Gene_ID"))
colnames(final_rbh_jj)[4] <- "am_gene_symbol"

final_rbh_to_save <- final_rbh_jj %>%
  select(am_gene_ID, am_gene_symbol, dr_gene_ID, dr_gene_symbol)

write.csv(final_rbh_to_save,
          file = paste0(plot_root, "am_dr_rbh.csv"),
          row.names = FALSE)

am_gene_rbh <- nrow(final_rbh_to_save) / nrow(am_gene_data) * 100
am_gene_rbh
# [1] 60.93727
# 16709 out of 27420

dr_gene_rbh <- nrow(final_rbh_to_save) / nrow(dr_gene_data) * 100
dr_gene_rbh
# [1] 51.38069 16709 out of 32520


#in house rbh pipeline script:
#/n/core/bigDataAI/jr2396/CompBio/mcm/rbh_pipeline/scripts/rbh.R
## ===== save cavefish object as h5ad ===== ##

##load integrated seu obj with some sort of annotations
load(file = "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/data/interdata/analysis_v1/All_cells_integrated_PC30res0.4.RData")
merged.c[["RNA"]] <- split(merged.c[["RNA"]], f = merged.c$sample_name)
DefaultAssay(merged.c) <- "RNA"
merged.c[["SCT"]] <- NULL

##convert a v5 assay to a v3 assay
merged.c[["RNA3"]] <- as(object = merged.c[["RNA"]], Class = "Assay")
DefaultAssay(merged.c) <- "RNA3"
merged.c[["RNA"]] <- NULL
merged.c[["RNA"]] <- merged.c[["RNA3"]]
DefaultAssay(merged.c) <- "RNA"
merged.c[["RNA3"]] <- NULL

##this is for my ocd
merged.c$nFeature_SCT <- NULL
merged.c$nCount_SCT <- NULL

SeuratDisk::SaveH5Seurat(merged.c,
             filename = paste0(inter_data, "am.h5Seurat"),
             overwrite = TRUE)
SeuratDisk::Convert(
  paste0(inter_data, "am.h5Seurat"),
  overwrite = TRUE,
  paste0(inter_data, "am.h5ad")
)
## ===== save cavefish object as h5ad ===== ##

##load integrated seu obj with some sort of annotations
load(file = paste0(inter_data, "all_fish_harmonied_post_SCTmarkers_PC40res.5.RData"))
merged.c <- seu_m.c

merged.c[["RNA"]] <- split(merged.c[["RNA"]], f = merged.c$sample_name)
DefaultAssay(merged.c) <- "RNA"
merged.c[["SCT"]] <- NULL

##convert a v5 assay to a v3 assay
merged.c[["RNA3"]] <- as(object = merged.c[["RNA"]], Class = "Assay")
DefaultAssay(merged.c) <- "RNA3"
merged.c[["RNA"]] <- NULL
merged.c[["RNA"]] <- merged.c[["RNA3"]]
DefaultAssay(merged.c) <- "RNA"
merged.c[["RNA3"]] <- NULL

##this is for my ocd
merged.c$nFeature_SCT <- NULL
merged.c$nCount_SCT <- NULL

SeuratDisk::SaveH5Seurat(merged.c,
             filename = paste0(inter_data, "am2.h5Seurat"),
             overwrite = TRUE)
SeuratDisk::Convert(
  paste0(inter_data, "am2.h5Seurat"),
  overwrite = TRUE,
  paste0(inter_data, "am2.h5ad")
)

Refer to SAMap.html

SAMap results

UMAP

#read in the h5ad file from SAMap: https://github.com/cellgeni/schard
#sceasy and seuratdisk both didn't work
seuobj <- schard::h5ad2seurat(paste0(inter_data, "samap_20240501.h5ad"))

pd <- DimPlot(seuobj,
              group.by = "species",
              pt.size = .1,
              cols = ggsci::pal_jco(alpha = .3)(2)) +
  labs(x = "UMAP_1", y = "UMAP_2") +
  theme(text = element_text(size = 18)) +
  coord_fixed(ratio = 1.2)

png(file = paste0(plot_root, "samap_20240501_umap.png"), width = 800, height = 600)
pd
dev.off()

rm(pd)


Cavefish
#am cluster
curio26 <- readRDS("~/color/curio26.rds")

seuobj_am <- subset(seuobj, subset = species == "am")
#this is for visualization purpose

seuobj_am$am_seurat_clusters <- factor(seuobj_am$am_seurat_clusters, levels = c(paste0("Clu_", 0:25)))
p <- DimPlot(seuobj_am,
             group.by = "am_seurat_clusters",
             pt.size = .6,
             label = TRUE,
             cols = c(curio26, "snow")) +
  labs(x = "UMAP_1", y = "UMAP_2") +
  theme(text = element_text(size = 18)) +
  coord_fixed(ratio = 1.2)

png(file = paste0(plot_root, "samap_20240501_am_cluster.png"), width = 800, height = 600)
p
dev.off()

rm(p)


#split by fish
p <- DimPlot(seuobj_am,
             group.by = "am_seurat_clusters",
             split.by = "am_fish",
             pt.size = 1,
             label = TRUE,
             cols = curio26) +
  labs(x = "UMAP_1", y = "UMAP_2") +
  theme(text = element_text(size = 18)) +
  coord_fixed(ratio = 1.2)

png(filename = paste0(plot_root, "samap_20240501_am_fish_cluster.png"), width = 800*2.5, height = 600)
p
dev.off()



Zebrafish
seuobj_dr <- subset(seuobj, subset = species == "dr")
#this is for visualization purpose

p <- DimPlot(seuobj_dr,
             group.by = "dr_predicted.id",
             pt.size = .4,
             label = TRUE,
             repel = TRUE) +
  labs(x = "UMAP_1", y = "UMAP_2") +
  theme(text = element_text(size = 18)) +
  coord_fixed(ratio = 1)

png(file = paste0(plot_root, "sdrap_20240501_dr_cluster.png"), width = 800, height = 600)
p
dev.off()

rm(p)



Alignment score visualization

#read in the alignment score
mt <- read.csv(file = paste0(inter_data, "MappingTable_20240501.csv"))
rownames(mt) <- mt$X
mt_keep <- mt[mt$X %in% grep("^dr", mt$X, value = TRUE), colnames(mt) %in% grep("^am", mt$X, value = TRUE)]
mt_keep <- mt_keep[ , paste0("am_Clu_", 0:25)]

png(file = paste0(plot_root, "samap20240501_alignscore.png"), width = 500*2.7, height = 200*2.5, res = 100)
corrplot::corrplot(as.matrix(mt_keep),
                   method = "circle",
                   col = corrplot::COL2("PiYG", 200),
                   col.lim = c(0, 1),
                   is.corr = FALSE,
                   outline = TRUE,
                   tl.col = "black",
                   cl.ratio = .1,
                   mar = c(3, 1, 1, 1))
dev.off()



The alignment score is defined as the average number of mutual nearest cross-species neighbors of each cell relative to the maximum possible number of neighbors. It indicates similarity between cell types, the higher the alignment score, the more similar the cell types are.


Annotated UMAP

Processed seurat object can be downloaded from GEO GSE282933.

#load the seu obj from FN

load(file = "/n/projects/fx2482/scRNA-seq-ovary/output/PC40_res0.5_replicatemerge_fishintegrate_harmony/Rdata/new_annotated_all_fish_harmonied_post_SCTmarkers_PC40res.5.RData")
seu <- merged.integrate.40.c
rm(merged.integrate.40.c)

seu$annotation <- seu@active.ident

#check the UMAP
p <- DimPlot(seu,
             group.by = "annotation",
             pt.size = .4,
             label = FALSE,
             label.size = 5,
             label.color = "black",
             repel = TRUE,
             cols = ggsci::pal_d3("category10")(10)) +
  labs(x = "UMAP_1", y = "UMAP_2", title = "") +
  theme(text = element_text(size = 18)) +
  coord_fixed()

png(file = paste0(plot_root, "all_fish_harmonied_PC40res0.5_ann.png"), width = 800, height = 600)
p
dev.off()

rm(p)



Follicular somatic cell subcluster analysis

To manage the effects of noise and potential bias introduced by other cell populations during trajectory inference, we will subset follicular somatic cell (previously annotated as ovarian follicle cells) and perform subcluster analysis.

Consistent with previous analysis, we will merge fish group wise replicates then integrate the three groups by using Harmony.

#load the seurat obj
load(file = "/n/projects/fx2482/scRNA-seq-ovary/output/PC40_res0.5_replicatemerge_fishintegrate_harmony/Rdata/annotated_all_fish_harmonied_post_SCTmarkers_PC40res.5.RData")
seu <- merged.integrate.40.c
rm(merged.integrate.40.c)
seu$annotation <- seu@active.ident

#clean seu metadata
to_rm <- grep(pattern = "sample_name|pANN|SCT|DF|seurat", x = colnames(seu@meta.data), value = TRUE)
for (i in seq_along(to_rm)){
  seu[[to_rm[i]]] <- NULL
}

DefaultAssay(seu) <- "RNA" 
seu_sub <- subset(seu, subset = annotation %in% c("Ovarian follicle"))
seu_sub_split <- SplitObject(seu_sub, split.by = "orig.ident")

seu_sub_split <- lapply(X = seu_sub_split,
                        FUN = function(x) {
                          x <- SCTransform(
                            x,
                            vst.flavor = "v2",
                            variable.features.n = 3000,
                            return.only.var.genes = FALSE
                          )
                          return(x)
                        })

#we will merge fish group wise replicates then integrate the three groups
## ===== sf ===== ##
sf_m <- merge(seu_sub_split$Surface_fish_1, y = seu_sub_split$Surface_fish_2)
sf_m %<>% PrepSCTFindMarkers(assay = "SCT")
VariableFeatures(sf_m) <- c(VariableFeatures(seu_sub_split$Surface_fish_1),
                            VariableFeatures(seu_sub_split$Surface_fish_2))

## ===== pa ===== ##
pa_m <- merge(seu_sub_split$Pachon_1, y = seu_sub_split$Pachon_2)
pa_m %<>% PrepSCTFindMarkers(assay = "SCT")
VariableFeatures(pa_m) <- c(VariableFeatures(seu_sub_split$Pachon_1),
                            VariableFeatures(seu_sub_split$Pachon_2))

## ===== mo ===== ##
mo_m <- merge(seu_sub_split$Molino_1, y = seu_sub_split$Molino_2)
mo_m %<>% PrepSCTFindMarkers(assay = "SCT")
VariableFeatures(mo_m) <- c(VariableFeatures(seu_sub_split$Molino_1),
                            VariableFeatures(seu_sub_split$Molino_2))

save(sf_m, pa_m, mo_m, file = paste0(inter_data, "All_fish_ovarian_follicle_merged.RData"))

library(harmony)
#integration using Harmony
all_merge <- list(sf_m, pa_m, mo_m)
names(all_merge) <- c("Surface_fish", "Pachon", "Molino")

var_features <- SelectIntegrationFeatures(object.list = all_merge, nfeatures = 3000)

seu_m <- merge(all_merge[[1]],
               y = all_merge[2:length(all_merge)],
               merge.data = TRUE)

VariableFeatures(seu_m) <- var_features
seu_m %<>%
  RunPCA() %>%
  RunHarmony(assay.use = "SCT",
             group.by.vars = "fish")

save(seu_m, file = paste0(inter_data, "All_fish_ovarian_follicle_harmonied.RData"))

#elbow plot
p <- ElbowPlot(seu_m, ndims = 50) +
  ggtitle(label = "Elbow plot of PCs")

png(file = paste0(plot_root, "ovarian_follicle_elbow.png"), width = 800, height = 400)
print(p)
dev.off()



#explore PCs and res
i_range <- c(20, 30, 40)
j_range <- seq(from = 0.2, to = 0.8, by = 0.2)

plots_list <- list()
for (i in i_range){
  for (j in j_range){
    set.seed(10086)
    
    seu_m.c <- seu_m %>%
      FindNeighbors(dims = 1:i, reduction = "harmony") %>%
      FindClusters(resolution = j) %>%
      RunUMAP(dims = 1:i, reduction = "harmony")
    
    plot_obj <- DimPlot(seu_m.c, reduction = "umap", raster = FALSE, label = TRUE, repel = TRUE) +
      labs(caption = paste0("PC = ", as.character(i), " res = ", as.character(j)))
    plots_list[[paste0("PC", i, "res", j)]] <- plot_obj
  }
}

png(filename = paste0(plot_root, "all_fish_ovarian_follicle_harmonied_PCres.png"), width = 600*length(j_range), height = 500*length(i_range))
wrap_plots(plots_list, ncol = length(j_range))
dev.off()

rm(plot_obj, plots_list, i, j, i_range, j_range)


According to elbow plot and UMAPs with different PC and res combinations, we will choose PC = 40, res = 0.8 for downstream analysis.


PC 40 res 0.8

load(file = paste0(inter_data, "All_fish_ovarian_follicle_harmonied.RData"))

set.seed(10086)

i <- 40
j <- 0.8

seu_m.c <- seu_m %>%
  FindNeighbors(dims = 1:i, reduction = "harmony") %>%
  FindClusters(resolution = j) %>%
  RunUMAP(dims = 1:i, reduction = "harmony")

save(seu_m.c, file = paste0(inter_data, "All_fish_ovarian_follicle_sub_PC40res0.8.RData"))

color19_nord <- readRDS("~/color/color19_nord.rds")

ph <- DimPlot(seu_m.c, label = TRUE, label.box = TRUE, repel = TRUE, raster = FALSE, cols = color19_nord, pt.size = .5) +
  NoLegend() +
  coord_fixed()

png(filename = paste0(plot_root, "ovarian_follicle_PC40res0.8.png"), width = 800, height = 600)
ph
dev.off()

rm(ph)



load(file = paste0(inter_data, "All_fish_ovarian_follicle_sub_PC40res0.8.RData"))

seu_m.c$fish <- factor(seu_m.c$fish, levels = c("Surface_fish", "Pachon", "Molino"))
ph <- DimPlot(seu_m.c, label = TRUE, label.box = FALSE, repel = TRUE, raster = FALSE, cols = color19_nord, pt.size = 1.2, split.by = "fish") +
  NoLegend() +
  coord_fixed()
png(filename = paste0(plot_root, "ovarian_follicle_PC40res0.8_fish.png"), width = 800*3, height = 600)
print(ph)
dev.off()



#lhx9 is the marker of starting population cells
pf <- scCustomize::FeaturePlot_scCustom(seu_m.c,
                                        features = "lhx9",
                                        pt.size = .5) +
  coord_fixed()

png(filename = paste0(plot_root, "ovarian_follicle_PC40res0.8_lhx9.png"), width = 800, height = 600)
print(pf)
dev.off()



Trajectory analysis

Trajectory inference figure of zebrafish paper

library(monocle3)
library(SeuratWrappers)

#load seu obj
load(file = paste0(inter_data, "All_fish_ovarian_follicle_sub_PC40res0.8.RData"))

seu <- seu_m.c

seu %<>% PrepSCTFindMarkers(assay = "SCT")

cds <- seu %>%
  as.cell_data_set(assay = "SCT",
                   reduction = "umap") %>% #harmonied UMAP and SCT assay will be used for trajectory inference
  cluster_cells(reduction_method = "UMAP",
                cluster_method = "louvain") %>%
  learn_graph(use_partition = FALSE)

save(cds, file = paste0(inter_data, "All_fish_ovarian_follicle_sub_PC40res0.8_cds.RData"))

#visualize Principle points
plot_cells(cds, color_cells_by = "partition")

p <- plot_cells(cds,
           label_principal_points = TRUE,
           color_cells_by = "cluster",
           trajectory_graph_segment_size = 1,
           cell_size = 0.65,
           cell_stroke = 0.5,
           graph_label_size = 3)

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_pick_root.png"), width = 800, height = 600)
p
dev.off()



Y_88 will be set as the root cell for the trajectory inference.

cds %<>%
  order_cells(reduction_method = "UMAP",
              root_pr_nodes = "Y_88")

save(cds, file = paste0(inter_data, "All_fish_ovarian_follicle_sub_PC40res0.8_cds_ordered.RData"))

p <- plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups = FALSE,
           label_leaves = FALSE,
           label_roots = FALSE,
           label_branch_points = FALSE,
           show_trajectory_graph = TRUE,
           trajectory_graph_segment_size = .75,
           cell_size = 0.5,
           cell_stroke = 0.5,
           graph_label_size = 3) +
  viridis::scale_color_viridis(option = "plasma",
                               end = 1,
                               direction = 1) +
  coord_fixed() +
  guides(color = guide_colorbar(title = "Pseudotime"))

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_pseudotime.png"), width = 1000, height = 600, res = 100)
p
dev.off()



#split by fish visualization
p <- plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups = FALSE,
           label_leaves = FALSE,
           label_roots = FALSE,
           label_branch_points = FALSE,
           show_trajectory_graph = TRUE,
           trajectory_graph_segment_size = .75,
           cell_size = 0.5,
           cell_stroke = 0.5,
           graph_label_size = 3) +
  viridis::scale_color_viridis(option = "plasma",
                               end = 1,
                               direction = 1) +
  coord_fixed() +
  guides(color = guide_colorbar(title = "Pseudotime")) +
  facet_wrap(~fish, ncol = 1)

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_pseudotime_fish.png"), width = 1200, height = 600*3, res = 110)
print(p)
dev.off()



#updated 2024 oct
#fanning wants three separate figure for the three fish group
load(file = paste0(inter_data, "All_fish_ovarian_follicle_sub_PC40res0.8_cds_ordered.RData"))

#for title
facet_titles <- c("Surface_fish" = "Surface fish", "Pachon" = "Pachón", "Molino" = "Molino")

p <- plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups = FALSE,
           label_leaves = FALSE,
           label_roots = FALSE,
           label_branch_points = FALSE,
           show_trajectory_graph = TRUE,
           trajectory_graph_segment_size = .9,
           trajectory_graph_color = "black",
           cell_size = 0.5,
           cell_stroke = 0.5,
           graph_label_size = 3) +
  viridis::scale_color_viridis(option = "plasma",
                               end = 1,
                               direction = 1) +
  coord_fixed() +
  guides(color = guide_colorbar(title = "Pseudotime"))

# Loop through each page and save as a separate image
n_pages <- 3
for (i in 1:n_pages) {
  paginated_plot <- p +
    ggforce::facet_wrap_paginate(
      ~ fish,
      nrow = 1,
      ncol = 1,
      page = i,
      labeller = labeller(fish = facet_titles)
    ) +
    theme(strip.text = element_text(size = 17),
          axis.title.x = element_text(size = 12),
          axis.title.y = element_text(size = 12),
          legend.title = element_text(size = 13))
  
  # Save each page
  ggsave(paste0(plot_root, "facet_plot_page_", i, ".png"), paginated_plot, width = 8*.8, height = 4.5*.8)
}
p <- plot_cells(cds,
                color_cells_by = "pseudotime",
                label_cell_groups = FALSE,
                label_leaves = FALSE,
                label_roots = TRUE,
                label_branch_points = TRUE,
                show_trajectory_graph = TRUE,
                trajectory_graph_segment_size = .75,
                cell_size = 0.5,
                cell_stroke = 0.5,
                graph_label_size = 3) +
  viridis::scale_color_viridis(option = "plasma",
                               end = 1,
                               direction = 1) +
  coord_fixed() +
  guides(color = guide_colorbar(title = "Pseudotime"))

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_branch.png"), width = 1000, height = 600, res = 100)
print(p)
dev.off()



p <- plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups = FALSE,
           label_leaves = TRUE,
           label_roots = TRUE,
           label_branch_points = TRUE,
           show_trajectory_graph = TRUE,
           trajectory_graph_segment_size = .75,
           cell_size = 0.5,
           cell_stroke = 0.5,
           graph_label_size = 3) +
  viridis::scale_color_viridis(option = "plasma",
                               end = 1,
                               direction = 1) +
  coord_fixed() +
  guides(color = guide_colorbar(title = "Pseudotime"))

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_leaves.png"), width = 1000, height = 600, res = 100)
print(p)
dev.off()



Note: The black lines show the structure of the graph. The circles with numbers in them denote special points within the graph. Each leaf, denoted by light gray circles, corresponds to a different outcome (i.e. cell fate) of the trajectory. Black circles indicate branch nodes, in which cells can travel to one of several outcomes. The white circle with number 1 in it indicates the root.


#density plot

#extract df from metadata so we get cell_id - fish
df_p <- as.data.frame(colData(cds))
df_p %<>% rownames_to_column(var = "cell_id")

pseu_time <- pseudotime(cds)
pseu_time_df <- data.frame(pseu_time)
pseu_time_df %<>% rownames_to_column(var = "cell_id")

df_p_j <- left_join(pseu_time_df, df_p, by = "cell_id")

p <- ggplot(df_p_j, aes(x = pseu_time, fill = fish, color = fish)) +
  geom_density() +
  theme_minimal() +
  ggsci::scale_fill_jco(alpha = .5) +
  ggsci::scale_color_jco(alpha = .6) +
  ggtitle("Density of pseudotime group by fish") +
  xlab("Pseudotime") +
  ylab("Density") +
  theme(text = element_text(size = 18))

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_pseudotime_density.png"), width = 800, height = 400)
print(p)
dev.off()



#https://rpubs.com/mahima_bose/Seurat_and_Monocle3_p
rowData(cds)$gene_name <- rownames(cds)
rowData(cds)$gene_short_name <- rowData(cds)$gene_name

cds$Pseudotime <- pseudotime(cds)

plot_fun <- function(gene){
  for_plot_cds <- cds[gene, ]
  p <- monocle3::plot_genes_in_pseudotime(for_plot_cds,
                                          color_cells_by = "pseudotime") +
    theme(text = element_text(size = 20))
  png(filename = paste0(plot_root, "monocle3_ovarian_follicle_sub_", gene, "_pseu.png"), width = 800, height = 400, res = 100)
  print(p)
  dev.off()
}

to_add <- c("wt1a", "gsdf", "fshr", "cyp11a1.1")#"lhcrg"
for (i in seq_along(to_add)){
  plot_fun(to_add[i])
}

for_plot_cds <- cds["lhx9", ]
p <- monocle3::plot_genes_in_pseudotime(for_plot_cds,
                                        color_cells_by = "pseudotime") +
  theme(text = element_text(size = 20))

png(filename = paste0(plot_root, "monocle3_ovarian_follicle_sub_lhx9_pseu.png"), width = 800, height = 400, res = 100)
print(p)
dev.off()
to_add <- c("lhx9", "wt1a", "gsdf", "fshr", "cyp11a1.1", "lhcgr")
plot_root <- "/n/core/Bioinformatics/analysis/Rohner/fx2482/cbio.fx2482.100/results/plots/analysis_v1/"

for (i in seq_along(to_add)){
  cat(paste0("![](", plot_root, "monocle3_ovarian_follicle_sub_", to_add[i], "_pseu.png)\n\n"))
}


p <- plot_cells(
  cds,
  genes = "lhx9",
  label_cell_groups = FALSE,
  show_trajectory_graph = TRUE,
  label_leaves = FALSE,
  label_branch_points = FALSE,
  label_roots = FALSE,
  cell_size = 0.4,
  cell_stroke = .6,
  alpha = 0.8
) +
  coord_fixed() +
  theme(text = element_text(size = 18),
        legend.key.size = unit(.5, "cm"),
        legend.text = element_text(size = 10), 
        legend.title = element_text(size = 18))

png(file = paste0(plot_root, "monocle3_ovarian_follicle_sub_lhx9.png"), width = 1000, height = 600, res = 100)
print(p)
dev.off()


Session information

This report was knitted on 2024-12-12.

library(tidyverse) #tidy data
library(dplyr) #tidy data 
library(Seurat) #scRNAseq infrastructure
library(zeallot) #assignment of pipe
library(magrittr) #assignment of pipe
library(ggplot2) #plots
library(ggthemes) #plots
library(scales) #plots
library(patchwork) #plots
library(harmony)
library(monocle3)
library(DoubletFinder)
library(SeuratWrappers)

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.1 (2023-06-16)
##  os       Rocky Linux 8.9 (Green Obsidian)
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/Chicago
##  date     2024-12-12
##  pandoc   3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package              * version   date (UTC) lib source
##  abind                  1.4-8     2024-09-12 [1] CRAN (R 4.3.1)
##  Biobase              * 2.62.0    2023-10-24 [1] Bioconductor
##  BiocGenerics         * 0.48.1    2023-11-01 [1] Bioconductor
##  BiocManager            1.30.22   2023-08-08 [2] CRAN (R 4.3.1)
##  bitops                 1.0-9     2024-10-03 [1] CRAN (R 4.3.1)
##  boot                   1.3-28.1  2022-11-22 [2] CRAN (R 4.3.1)
##  bslib                  0.8.0     2024-07-29 [1] CRAN (R 4.3.1)
##  cachem                 1.1.0     2024-05-16 [1] CRAN (R 4.3.1)
##  callr                  3.7.3     2022-11-02 [2] CRAN (R 4.3.1)
##  cellranger             1.1.0     2016-07-27 [2] CRAN (R 4.3.1)
##  cli                    3.6.3     2024-06-21 [1] CRAN (R 4.3.1)
##  cluster                2.1.4     2022-08-22 [2] CRAN (R 4.3.1)
##  codetools              0.2-19    2023-02-01 [2] CRAN (R 4.3.1)
##  colorspace             2.1-1     2024-07-26 [2] CRAN (R 4.3.1)
##  cowplot                1.1.3     2024-01-22 [1] CRAN (R 4.3.1)
##  crayon                 1.5.3     2024-06-20 [1] CRAN (R 4.3.1)
##  data.table             1.16.2    2024-10-10 [1] CRAN (R 4.3.1)
##  DelayedArray           0.28.0    2023-10-24 [1] Bioconductor
##  deldir                 2.0-4     2024-02-28 [1] CRAN (R 4.3.1)
##  devtools               2.4.5     2022-10-11 [2] CRAN (R 4.3.1)
##  digest                 0.6.37    2024-08-19 [1] CRAN (R 4.3.1)
##  dotCall64              1.2       2024-10-04 [1] CRAN (R 4.3.1)
##  DoubletFinder        * 2.0.4     2024-01-31 [1] Github (chris-mcginnis-ucsf/DoubletFinder@853e2de)
##  dplyr                * 1.1.4     2023-11-17 [1] CRAN (R 4.3.1)
##  ellipsis               0.3.2     2021-04-29 [2] CRAN (R 4.3.1)
##  evaluate               1.0.1     2024-10-10 [1] CRAN (R 4.3.1)
##  fansi                  1.0.6     2023-12-08 [1] CRAN (R 4.3.1)
##  farver                 2.1.2     2024-05-13 [1] CRAN (R 4.3.1)
##  fastDummies            1.7.3     2023-07-06 [2] CRAN (R 4.3.1)
##  fastmap                1.2.0     2024-05-15 [1] CRAN (R 4.3.1)
##  fitdistrplus           1.1-11    2023-04-25 [2] CRAN (R 4.3.1)
##  forcats              * 1.0.0     2023-01-29 [2] CRAN (R 4.3.1)
##  fs                     1.6.5     2024-10-30 [1] CRAN (R 4.3.1)
##  future                 1.34.0    2024-07-29 [1] CRAN (R 4.3.1)
##  future.apply           1.11.3    2024-10-27 [1] CRAN (R 4.3.1)
##  generics               0.1.3     2022-07-05 [2] CRAN (R 4.3.1)
##  GenomeInfoDb         * 1.38.8    2024-03-15 [1] Bioconductor 3.18 (R 4.3.1)
##  GenomeInfoDbData       1.2.11    2023-12-04 [1] Bioconductor
##  GenomicRanges        * 1.54.1    2023-10-29 [1] Bioconductor
##  ggplot2              * 3.5.1     2024-04-23 [1] CRAN (R 4.3.1)
##  ggrepel                0.9.6     2024-09-07 [1] CRAN (R 4.3.1)
##  ggridges               0.5.6     2024-01-23 [1] CRAN (R 4.3.1)
##  ggthemes             * 4.2.4     2021-01-20 [2] CRAN (R 4.3.1)
##  globals                0.16.3    2024-03-08 [1] CRAN (R 4.3.1)
##  glue                   1.8.0     2024-09-30 [1] CRAN (R 4.3.1)
##  goftest                1.2-3     2021-10-07 [2] CRAN (R 4.3.1)
##  gridExtra              2.3       2017-09-09 [2] CRAN (R 4.3.1)
##  gtable                 0.3.6     2024-10-25 [1] CRAN (R 4.3.1)
##  harmony              * 1.2.1     2024-08-27 [1] CRAN (R 4.3.1)
##  hms                    1.1.3     2023-03-21 [2] CRAN (R 4.3.1)
##  htmltools              0.5.8.1   2024-04-04 [1] CRAN (R 4.3.1)
##  htmlwidgets            1.6.4     2023-12-06 [1] CRAN (R 4.3.1)
##  httpuv                 1.6.15    2024-03-26 [1] CRAN (R 4.3.1)
##  httr                   1.4.7     2023-08-15 [2] CRAN (R 4.3.1)
##  ica                    1.0-3     2022-07-08 [2] CRAN (R 4.3.1)
##  igraph                 2.0.3     2024-03-13 [1] CRAN (R 4.3.1)
##  IRanges              * 2.36.0    2023-10-24 [1] Bioconductor
##  irlba                  2.3.5.1   2022-10-03 [1] CRAN (R 4.3.1)
##  jquerylib              0.1.4     2021-04-26 [2] CRAN (R 4.3.1)
##  jsonlite               1.8.9     2024-09-20 [1] CRAN (R 4.3.1)
##  KernSmooth             2.23-22   2023-07-10 [2] CRAN (R 4.3.1)
##  knitr                  1.48      2024-07-07 [1] CRAN (R 4.3.1)
##  later                  1.3.2     2023-12-06 [1] CRAN (R 4.3.1)
##  lattice                0.21-8    2023-04-05 [2] CRAN (R 4.3.1)
##  lazyeval               0.2.2     2019-03-15 [2] CRAN (R 4.3.1)
##  leiden                 0.4.3.1   2023-11-17 [1] CRAN (R 4.3.1)
##  lifecycle              1.0.4     2023-11-07 [1] CRAN (R 4.3.1)
##  listenv                0.9.1     2024-01-29 [1] CRAN (R 4.3.1)
##  lme4                   1.1-35.5  2024-07-03 [1] CRAN (R 4.3.1)
##  lmtest                 0.9-40    2022-03-21 [2] CRAN (R 4.3.1)
##  lubridate            * 1.9.3     2023-09-27 [1] CRAN (R 4.3.1)
##  magrittr             * 2.0.3     2022-03-30 [2] CRAN (R 4.3.1)
##  MASS                   7.3-60    2023-05-04 [2] CRAN (R 4.3.1)
##  Matrix                 1.6-5     2024-01-11 [1] CRAN (R 4.3.1)
##  MatrixGenerics       * 1.14.0    2023-10-24 [1] Bioconductor
##  matrixStats          * 1.4.1     2024-09-08 [1] CRAN (R 4.3.1)
##  memoise                2.0.1     2021-11-26 [2] CRAN (R 4.3.1)
##  mime                   0.12      2021-09-28 [2] CRAN (R 4.3.1)
##  miniUI                 0.1.1.1   2018-05-18 [2] CRAN (R 4.3.1)
##  minqa                  1.2.8     2024-08-17 [1] CRAN (R 4.3.1)
##  monocle3             * 1.3.4     2023-12-05 [1] Github (cole-trapnell-lab/monocle3@2b17745)
##  munsell                0.5.1     2024-04-01 [1] CRAN (R 4.3.1)
##  nlme                   3.1-163   2023-08-09 [2] CRAN (R 4.3.1)
##  nloptr                 2.0.3     2022-05-26 [2] CRAN (R 4.3.1)
##  parallelly             1.38.0    2024-07-27 [1] CRAN (R 4.3.1)
##  patchwork            * 1.3.0     2024-09-16 [1] CRAN (R 4.3.1)
##  pbapply                1.7-2     2023-06-27 [2] CRAN (R 4.3.1)
##  pillar                 1.9.0     2023-03-22 [2] CRAN (R 4.3.1)
##  pkgbuild               1.4.2     2023-06-26 [2] CRAN (R 4.3.1)
##  pkgconfig              2.0.3     2019-09-22 [2] CRAN (R 4.3.1)
##  pkgload                1.4.0     2024-06-28 [1] CRAN (R 4.3.1)
##  plotly                 4.10.4    2024-01-13 [1] CRAN (R 4.3.1)
##  plyr                   1.8.9     2023-10-02 [1] CRAN (R 4.3.1)
##  png                    0.1-8     2022-11-29 [2] CRAN (R 4.3.1)
##  polyclip               1.10-7    2024-07-23 [1] CRAN (R 4.3.1)
##  prettyunits            1.2.0     2023-09-24 [1] CRAN (R 4.3.1)
##  processx               3.8.2     2023-06-30 [2] CRAN (R 4.3.1)
##  profvis                0.3.8     2023-05-02 [2] CRAN (R 4.3.1)
##  progressr              0.15.0    2024-10-29 [2] CRAN (R 4.3.1)
##  promises               1.3.0     2024-04-05 [1] CRAN (R 4.3.1)
##  ps                     1.7.5     2023-04-18 [2] CRAN (R 4.3.1)
##  purrr                * 1.0.2     2023-08-10 [2] CRAN (R 4.3.1)
##  R.methodsS3            1.8.2     2022-06-13 [2] CRAN (R 4.3.1)
##  R.oo                   1.25.0    2022-06-12 [2] CRAN (R 4.3.1)
##  R.utils                2.12.2    2022-11-11 [2] CRAN (R 4.3.1)
##  R6                     2.5.1     2021-08-19 [2] CRAN (R 4.3.1)
##  RANN                   2.6.1     2019-01-08 [2] CRAN (R 4.3.1)
##  RColorBrewer           1.1-3     2022-04-03 [2] CRAN (R 4.3.1)
##  Rcpp                 * 1.0.13-1  2024-11-02 [1] CRAN (R 4.3.1)
##  RcppAnnoy              0.0.22    2024-01-23 [1] CRAN (R 4.3.1)
##  RcppHNSW               0.6.0     2024-02-04 [1] CRAN (R 4.3.1)
##  RCurl                  1.98-1.14 2024-01-09 [1] CRAN (R 4.3.1)
##  readr                * 2.1.5     2024-01-10 [1] CRAN (R 4.3.1)
##  readxl                 1.4.3     2023-07-06 [2] CRAN (R 4.3.1)
##  remotes                2.4.2.1   2023-07-18 [2] CRAN (R 4.3.1)
##  reshape2               1.4.4     2020-04-09 [2] CRAN (R 4.3.1)
##  reticulate             1.37.0    2024-05-21 [1] CRAN (R 4.3.1)
##  rlang                  1.1.4     2024-06-04 [1] CRAN (R 4.3.1)
##  rmarkdown              2.29      2024-11-04 [1] CRAN (R 4.3.1)
##  ROCR                   1.0-11    2020-05-02 [2] CRAN (R 4.3.1)
##  RSpectra               0.16-1    2022-04-24 [2] CRAN (R 4.3.1)
##  rstudioapi             0.17.1    2024-10-22 [1] CRAN (R 4.3.1)
##  rsvd                   1.0.5     2021-04-16 [2] CRAN (R 4.3.1)
##  Rtsne                  0.17      2023-12-07 [1] CRAN (R 4.3.1)
##  S4Arrays               1.2.1     2024-03-04 [1] Bioconductor 3.18 (R 4.3.1)
##  S4Vectors            * 0.40.2    2023-11-23 [1] Bioconductor 3.18 (R 4.3.1)
##  sass                   0.4.9     2024-03-15 [1] CRAN (R 4.3.1)
##  scales               * 1.3.0     2023-11-28 [1] CRAN (R 4.3.1)
##  scattermore            1.2       2023-06-12 [2] CRAN (R 4.3.1)
##  sctransform            0.4.1     2023-10-19 [2] CRAN (R 4.3.1)
##  sessioninfo            1.2.2     2021-12-06 [2] CRAN (R 4.3.1)
##  Seurat               * 5.1.0     2024-05-10 [1] CRAN (R 4.3.1)
##  SeuratObject         * 5.0.2     2024-05-08 [1] CRAN (R 4.3.1)
##  SeuratWrappers       * 0.3.4     2024-03-04 [1] Github (satijalab/seurat-wrappers@d9594f6)
##  shiny                  1.9.1     2024-08-01 [1] CRAN (R 4.3.1)
##  SingleCellExperiment * 1.24.0    2024-04-15 [1] bioc_xgit (@2fd8e49)
##  sp                   * 2.1-4     2024-04-30 [1] CRAN (R 4.3.1)
##  spam                   2.11-0    2024-10-03 [1] CRAN (R 4.3.1)
##  SparseArray            1.2.4     2024-02-11 [1] Bioconductor 3.18 (R 4.3.1)
##  spatstat.data          3.1-2     2024-06-21 [1] CRAN (R 4.3.1)
##  spatstat.explore       3.2-7     2024-03-21 [1] CRAN (R 4.3.1)
##  spatstat.geom          3.2-9     2024-02-28 [1] CRAN (R 4.3.1)
##  spatstat.random        3.2-3     2024-02-29 [1] CRAN (R 4.3.1)
##  spatstat.sparse        3.1-0     2024-06-21 [1] CRAN (R 4.3.1)
##  spatstat.utils         3.0-5     2024-06-17 [1] CRAN (R 4.3.1)
##  stringi                1.8.4     2024-05-06 [1] CRAN (R 4.3.1)
##  stringr              * 1.5.1     2023-11-14 [1] CRAN (R 4.3.1)
##  SummarizedExperiment * 1.32.0    2023-10-24 [1] Bioconductor
##  survival               3.5-7     2023-08-14 [2] CRAN (R 4.3.1)
##  tensor                 1.5       2012-05-05 [2] CRAN (R 4.3.1)
##  terra                  1.7-83    2024-10-14 [1] CRAN (R 4.3.1)
##  tibble               * 3.2.1     2023-03-20 [2] CRAN (R 4.3.1)
##  tidyr                * 1.3.1     2024-01-24 [1] CRAN (R 4.3.1)
##  tidyselect             1.2.1     2024-03-11 [1] CRAN (R 4.3.1)
##  tidyverse            * 2.0.0     2023-02-22 [2] RSPM (R 4.3.0)
##  timechange             0.3.0     2024-01-18 [1] CRAN (R 4.3.1)
##  tzdb                   0.4.0     2023-05-12 [2] CRAN (R 4.3.1)
##  urlchecker             1.0.1     2021-11-30 [2] CRAN (R 4.3.1)
##  usethis                2.2.2     2023-07-06 [2] CRAN (R 4.3.1)
##  utf8                   1.2.4     2023-10-22 [1] CRAN (R 4.3.1)
##  uwot                   0.2.2     2024-04-21 [1] CRAN (R 4.3.1)
##  vctrs                  0.6.5     2023-12-01 [1] CRAN (R 4.3.1)
##  viridisLite            0.4.2     2023-05-02 [2] CRAN (R 4.3.1)
##  withr                  3.0.2     2024-10-28 [1] CRAN (R 4.3.1)
##  xfun                   0.44      2024-05-15 [1] CRAN (R 4.3.1)
##  xtable                 1.8-4     2019-04-21 [2] CRAN (R 4.3.1)
##  XVector                0.42.0    2023-10-24 [1] Bioconductor
##  yaml                   2.3.10    2024-07-26 [1] CRAN (R 4.3.1)
##  zeallot              * 0.1.0     2018-01-28 [1] CRAN (R 4.3.1)
##  zlibbioc               1.48.2    2024-03-13 [1] Bioconductor 3.18 (R 4.3.1)
##  zoo                    1.8-12    2023-04-13 [2] CRAN (R 4.3.1)
## 
##  [1] /home/dw2733/R/HPC/x86_64-pc-linux-gnu-library/4.3.1
##  [2] /opt/apps/dev/R/library/4.3.1
##  [3] /opt/R/4.3.1/lib/R/library
## 
## ──────────────────────────────────────────────────────────────────────────────